{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Processing\n",
    "\n",
    "## Capturing Text Data\n",
    "\n",
    "### Plain Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hieroglyphic writing dates from c. 3000 BC, and is composed of hundreds of symbols. A hieroglyph can represent a word, a sound, or a silent determinative; and the same symbol can serve different purposes in different contexts. Hieroglyphs were a formal script, used on stone monuments and in tombs, that could be as detailed as individual works of art.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Read in a plain text file.\n",
    "# An explicit encoding keeps decoding independent of the platform's locale\n",
    "# default (assumes the file is stored as UTF-8 -- adjust if it is not).\n",
    "with open(os.path.join(\"data\", \"hieroglyph.txt\"), \"r\", encoding=\"utf-8\") as f:\n",
    "    text = f.read()\n",
    "\n",
    "# The contents are already in memory, so print after the file has been closed\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tabular Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>publisher</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Livemint</td>\n",
       "      <td>fed's charles plosser sees high bar for change...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>IFA Magazine</td>\n",
       "      <td>us open: stocks fall after fed official hints ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>IFA Magazine</td>\n",
       "      <td>fed risks falling 'behind the curve', charles ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Moneynews</td>\n",
       "      <td>fed's plosser: nasty weather has curbed job gr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NASDAQ</td>\n",
       "      <td>plosser: fed may have to accelerate tapering pace</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      publisher                                              title\n",
       "0      Livemint  fed's charles plosser sees high bar for change...\n",
       "1  IFA Magazine  us open: stocks fall after fed official hints ...\n",
       "2  IFA Magazine  fed risks falling 'behind the curve', charles ...\n",
       "3     Moneynews  fed's plosser: nasty weather has curbed job gr...\n",
       "4        NASDAQ  plosser: fed may have to accelerate tapering pace"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the news dataset; the text of interest is in the 'title' column\n",
    "df = pd.read_csv(os.path.join(\"data\", \"news.csv\"))\n",
    "\n",
    "# Convert text column to lowercase\n",
    "df['title'] = df['title'].str.lower()\n",
    "\n",
    "# Jupyter renders only a cell's final expression, so preview the relevant\n",
    "# columns once, after the transformation (a bare expression placed mid-cell\n",
    "# would be evaluated and silently discarded)\n",
    "df[['publisher', 'title']].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Online Resource"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"success\": {\n",
      "        \"total\": 1\n",
      "    },\n",
      "    \"contents\": {\n",
      "        \"quotes\": [\n",
      "            {\n",
      "                \"quote\": \"When you win, say nothing. When you lose, say less.\",\n",
      "                \"author\": \"Paul Brown\",\n",
      "                \"length\": \"51\",\n",
      "                \"tags\": [\n",
      "                    \"inspire\",\n",
      "                    \"losing\",\n",
      "                    \"running\",\n",
      "                    \"winning\"\n",
      "                ],\n",
      "                \"category\": \"inspire\",\n",
      "                \"title\": \"Inspiring Quote of the day\",\n",
      "                \"date\": \"2018-05-09\",\n",
      "                \"id\": null\n",
      "            }\n",
      "        ],\n",
      "        \"copyright\": \"2017-19 theysaidso.com\"\n",
      "    }\n",
      "}\n",
      "When you win, say nothing. When you lose, say less. \n",
      "-- Paul Brown\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import json\n",
    "\n",
    "# Fetch data from a REST API.\n",
    "# requests applies no timeout by default, so set one to keep an unresponsive\n",
    "# server from hanging the kernel indefinitely.\n",
    "r = requests.get(\n",
    "    \"https://quotes.rest/qod.json\", timeout=10)\n",
    "# Fail right here with an HTTPError on a 4xx/5xx reply instead of a less\n",
    "# informative JSON decoding error or KeyError further down\n",
    "r.raise_for_status()\n",
    "res = r.json()\n",
    "print(json.dumps(res, indent=4))\n",
    "\n",
    "# Extract relevant object and field\n",
    "q = res[\"contents\"][\"quotes\"][0]\n",
    "print(q[\"quote\"], \"\\n--\", q[\"author\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html op=\"news\"><head><meta name=\"referrer\" content=\"origin\"><meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\"><link rel=\"stylesheet\" type=\"text/css\" href=\"news.css?IE5MesKJj5PfEtg18JAO\">\n",
      "            <link rel=\"shortcut icon\" href=\"favicon.ico\">\n",
      "          <link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS\" href=\"rss\">\n",
      "        <title>Hacker News</title></head><body><center><table id=\"hnmain\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"85%\" bgcolor=\"#f6f6ef\">\n",
      "        <tr><td bgcolor=\"#ff6600\"><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"100%\" style=\"padding:2px\"><tr><td style=\"width:18px;padding-right:4px\"><a href=\"https://news.ycombinator.com\"><img src=\"y18.gif\" width=\"18\" height=\"18\" style=\"border:1px white solid;\"></a></td>\n",
      "                  <td style=\"line-height:12pt; height:10px;\"><span class=\"pagetop\"><b class=\"hnname\"><a href=\"news\">Hacker News</a></b>\n",
      "              <a href=\"newest\">new</a> | <a href=\"newcomments\">comments</a> | <a href=\"show\">show</a> | <a href=\"ask\">ask</a> | <a href=\"jobs\">jobs</a> | <a href=\"submit\">submit</a>            </span></td><td style=\"text-align:right;padding-right:4px;\"><span class=\"pagetop\">\n",
      "                              <a href=\"login?goto=news\">login</a>\n",
      "                          </span></td>\n",
      "              </tr></table></td></tr>\n",
      "<tr style=\"height:10px\"></tr><tr><td><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" class=\"itemlist\">\n",
      "              <tr class='athing' id='17022963'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">1.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17022963' href='vote?id=17022963&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://ai.googleblog.com/2018/05/duplex-ai-system-for-natural-conversation.html\" class=\"storylink\">Google Duplex: An AI System for Accomplishing Real World Tasks Over the Phone</a><span class=\"sitebit comhead\"> (<a href=\"from?site=googleblog.com\"><span class=\"sitestr\">googleblog.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17022963\">1023 points</span> by <a href=\"user?id=ivank\" class=\"hnuser\">ivank</a> <span class=\"age\"><a href=\"item?id=17022963\">9 hours ago</a></span> <span id=\"unv_17022963\"></span> | <a href=\"hide?id=17022963&amp;goto=news\">hide</a> | <a href=\"item?id=17022963\">432&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17026822'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">2.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17026822' href='vote?id=17026822&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.cs.utexas.edu/users/EWD/ewd03xx/EWD316.PDF\" class=\"storylink\">A Short Introduction to the Art of Programming – Edsgar W. Dijkstra [pdf]</a><span class=\"sitebit comhead\"> (<a href=\"from?site=utexas.edu\"><span class=\"sitestr\">utexas.edu</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17026822\">27 points</span> by <a href=\"user?id=Rescis\" class=\"hnuser\">Rescis</a> <span class=\"age\"><a href=\"item?id=17026822\">1 hour ago</a></span> <span id=\"unv_17026822\"></span> | <a href=\"hide?id=17026822&amp;goto=news\">hide</a> | <a href=\"item?id=17026822\">4&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17024637'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">3.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17024637' href='vote?id=17024637&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://techcrunch.com/2018/05/08/you-can-now-run-linux-apps-on-chrome-os/\" class=\"storylink\">You can now run Linux apps on Chrome OS</a><span class=\"sitebit comhead\"> (<a href=\"from?site=techcrunch.com\"><span class=\"sitestr\">techcrunch.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17024637\">357 points</span> by <a href=\"user?id=willsinclair\" class=\"hnuser\">willsinclair</a> <span class=\"age\"><a href=\"item?id=17024637\">6 hours ago</a></span> <span id=\"unv_17024637\"></span> | <a href=\"hide?id=17024637&amp;goto=news\">hide</a> | <a href=\"item?id=17024637\">153&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17020285'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">4.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17020285' href='vote?id=17020285&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://tcrf.net/The_Cutting_Room_Floor\" class=\"storylink\">The Cutting Room Floor: Unearthing Unused Content from Video Games</a><span class=\"sitebit comhead\"> (<a href=\"from?site=tcrf.net\"><span class=\"sitestr\">tcrf.net</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17020285\">64 points</span> by <a href=\"user?id=indescions_2018\" class=\"hnuser\">indescions_2018</a> <span class=\"age\"><a href=\"item?id=17020285\">5 hours ago</a></span> <span id=\"unv_17020285\"></span> | <a href=\"hide?id=17020285&amp;goto=news\">hide</a> | <a href=\"item?id=17020285\">7&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17026490'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">5.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17026490' href='vote?id=17026490&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://ambrevar.bitbucket.io/emacs-eshell/\" class=\"storylink\">Eshell as a main shell</a><span class=\"sitebit comhead\"> (<a href=\"from?site=bitbucket.io\"><span class=\"sitestr\">bitbucket.io</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17026490\">21 points</span> by <a href=\"user?id=taeric\" class=\"hnuser\">taeric</a> <span class=\"age\"><a href=\"item?id=17026490\">3 hours ago</a></span> <span id=\"unv_17026490\"></span> | <a href=\"hide?id=17026490&amp;goto=news\">hide</a> | <a href=\"item?id=17026490\">1&nbsp;comment</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17022764'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">6.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17022764' href='vote?id=17022764&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.yubico.com/2018/04/yubico-and-microsoft-introduce-passwordless-login/\" class=\"storylink\">Yubico and Microsoft Introduce Passwordless Login</a><span class=\"sitebit comhead\"> (<a href=\"from?site=yubico.com\"><span class=\"sitestr\">yubico.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17022764\">183 points</span> by <a href=\"user?id=guitarbill\" class=\"hnuser\">guitarbill</a> <span class=\"age\"><a href=\"item?id=17022764\">9 hours ago</a></span> <span id=\"unv_17022764\"></span> | <a href=\"hide?id=17022764&amp;goto=news\">hide</a> | <a href=\"item?id=17022764\">138&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17019719'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">7.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17019719' href='vote?id=17019719&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://github.com/objecthub/swift-lispkit#swift-lispkit\" class=\"storylink\">LispKit: framework for Lisp-based extension/scripting languages for macOS apps</a><span class=\"sitebit comhead\"> (<a href=\"from?site=github.com\"><span class=\"sitestr\">github.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17019719\">41 points</span> by <a href=\"user?id=ingve\" class=\"hnuser\">ingve</a> <span class=\"age\"><a href=\"item?id=17019719\">4 hours ago</a></span> <span id=\"unv_17019719\"></span> | <a href=\"hide?id=17019719&amp;goto=news\">hide</a> | <a href=\"item?id=17019719\">3&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17025627'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">8.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17025627' href='vote?id=17025627&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://dynimize.com/\" class=\"storylink\">Dynimize: Speed Up MySQL with CPU Performance Virtualization</a><span class=\"sitebit comhead\"> (<a href=\"from?site=dynimize.com\"><span class=\"sitestr\">dynimize.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17025627\">49 points</span> by <a href=\"user?id=nwrk\" class=\"hnuser\">nwrk</a> <span class=\"age\"><a href=\"item?id=17025627\">5 hours ago</a></span> <span id=\"unv_17025627\"></span> | <a href=\"hide?id=17025627&amp;goto=news\">hide</a> | <a href=\"item?id=17025627\">13&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17024245'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">9.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17024245' href='vote?id=17024245&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.blog.google/products/android/android-p/\" class=\"storylink\">Android P</a><span class=\"sitebit comhead\"> (<a href=\"from?site=blog.google\"><span class=\"sitestr\">blog.google</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17024245\">179 points</span> by <a href=\"user?id=alanfranzoni\" class=\"hnuser\">alanfranzoni</a> <span class=\"age\"><a href=\"item?id=17024245\">7 hours ago</a></span> <span id=\"unv_17024245\"></span> | <a href=\"hide?id=17024245&amp;goto=news\">hide</a> | <a href=\"item?id=17024245\">145&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17025612'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">10.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17025612' href='vote?id=17025612&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://zerobin.net/?9c76eb78e61f7138#+VAz8M7SHT3oZmSj7PSyr8zreoL3sRX5diPUvGO1uxE=\" class=\"storylink\">ls | grep “echo ${data}” – Why/how does this work?</a><span class=\"sitebit comhead\"> (<a href=\"from?site=zerobin.net\"><span class=\"sitestr\">zerobin.net</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17025612\">40 points</span> by <a href=\"user?id=indigodaddy\" class=\"hnuser\">indigodaddy</a> <span class=\"age\"><a href=\"item?id=17025612\">5 hours ago</a></span> <span id=\"unv_17025612\"></span> | <a href=\"hide?id=17025612&amp;goto=news\">hide</a> | <a href=\"item?id=17025612\">28&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17020944'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">11.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17020944' href='vote?id=17020944&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://nickjanetakis.com/blog/a-recycled-ip-address-caused-me-to-pirate-390000-books-by-accident\" class=\"storylink\">A Recycled IP Address Caused Me to Pirate Books by Accident</a><span class=\"sitebit comhead\"> (<a href=\"from?site=nickjanetakis.com\"><span class=\"sitestr\">nickjanetakis.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17020944\">213 points</span> by <a href=\"user?id=nickjj\" class=\"hnuser\">nickjj</a> <span class=\"age\"><a href=\"item?id=17020944\">13 hours ago</a></span> <span id=\"unv_17020944\"></span> | <a href=\"hide?id=17020944&amp;goto=news\">hide</a> | <a href=\"item?id=17020944\">78&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17019365'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">12.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17019365' href='vote?id=17019365&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"http://www.isreview.org/issues/24/anarchists_spain.shtml\" class=\"storylink\">Anarchists in the Spanish Civil War (2002)</a><span class=\"sitebit comhead\"> (<a href=\"from?site=isreview.org\"><span class=\"sitestr\">isreview.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17019365\">40 points</span> by <a href=\"user?id=dgarceran\" class=\"hnuser\">dgarceran</a> <span class=\"age\"><a href=\"item?id=17019365\">5 hours ago</a></span> <span id=\"unv_17019365\"></span> | <a href=\"hide?id=17019365&amp;goto=news\">hide</a> | <a href=\"item?id=17019365\">11&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17018851'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">13.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17018851' href='vote?id=17018851&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://medium.com/lapsed-historian/the-long-way-round-the-plane-that-accidentally-circumnavigated-the-world-c04ca734c6bb\" class=\"storylink\">A Plane That Accidentally Circumnavigated the World (2014)</a><span class=\"sitebit comhead\"> (<a href=\"from?site=medium.com\"><span class=\"sitestr\">medium.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17018851\">180 points</span> by <a href=\"user?id=SeoxyS\" class=\"hnuser\">SeoxyS</a> <span class=\"age\"><a href=\"item?id=17018851\">13 hours ago</a></span> <span id=\"unv_17018851\"></span> | <a href=\"hide?id=17018851&amp;goto=news\">hide</a> | <a href=\"item?id=17018851\">33&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17023981'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">14.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17023981' href='vote?id=17023981&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://github.com/mozilla/global-sprint/milestone/1\" class=\"storylink\">Mozilla Global Sprint 2018</a><span class=\"sitebit comhead\"> (<a href=\"from?site=github.com\"><span class=\"sitestr\">github.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17023981\">178 points</span> by <a href=\"user?id=robterthaddeus\" class=\"hnuser\">robterthaddeus</a> <span class=\"age\"><a href=\"item?id=17023981\">7 hours ago</a></span> <span id=\"unv_17023981\"></span> | <a href=\"hide?id=17023981&amp;goto=news\">hide</a> | <a href=\"item?id=17023981\">24&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17026957'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">15.</span></td>      <td></td><td class=\"title\"><a href=\"https://medium.com/@blitzesports/blitz-esports-is-hiring-a-front-end-engineer-1485c5bb8f18\" class=\"storylink\" rel=\"nofollow\">Blitz  Esports (YC S15) is hiring a front end engineer – build apps for gamers</a><span class=\"sitebit comhead\"> (<a href=\"from?site=medium.com\"><span class=\"sitestr\">medium.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"age\"><a href=\"item?id=17026957\">1 hour ago</a></span> | <a href=\"hide?id=17026957&amp;goto=news\">hide</a>      </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17023632'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">16.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17023632' href='vote?id=17023632&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.atrium.co/blog/how-to-find-meet-investors/\" class=\"storylink\">How to Find Investors and Get Email Intros</a><span class=\"sitebit comhead\"> (<a href=\"from?site=atrium.co\"><span class=\"sitestr\">atrium.co</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17023632\">55 points</span> by <a href=\"user?id=meredithah\" class=\"hnuser\">meredithah</a> <span class=\"age\"><a href=\"item?id=17023632\">8 hours ago</a></span> <span id=\"unv_17023632\"></span> | <a href=\"hide?id=17023632&amp;goto=news\">hide</a> | <a href=\"item?id=17023632\">15&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17026150'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">17.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17026150' href='vote?id=17026150&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://blog.truthlabs.com/building-a-progressive-web-app-in-react-11c77a7fccb3\" class=\"storylink\">Building a Progressive Web App in React, using Firestore for offline support</a><span class=\"sitebit comhead\"> (<a href=\"from?site=truthlabs.com\"><span class=\"sitestr\">truthlabs.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17026150\">33 points</span> by <a href=\"user?id=sconstantinides\" class=\"hnuser\">sconstantinides</a> <span class=\"age\"><a href=\"item?id=17026150\">4 hours ago</a></span> <span id=\"unv_17026150\"></span> | <a href=\"hide?id=17026150&amp;goto=news\">hide</a> | <a href=\"item?id=17026150\">3&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17020226'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">18.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17020226' href='vote?id=17020226&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"http://www.kkuniyuk.com/Math119FakingData.pdf\" class=\"storylink\">The Difficulty of Faking Data (1999) [pdf]</a><span class=\"sitebit comhead\"> (<a href=\"from?site=kkuniyuk.com\"><span class=\"sitestr\">kkuniyuk.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17020226\">23 points</span> by <a href=\"user?id=tontonius\" class=\"hnuser\">tontonius</a> <span class=\"age\"><a href=\"item?id=17020226\">4 hours ago</a></span> <span id=\"unv_17020226\"></span> | <a href=\"hide?id=17020226&amp;goto=news\">hide</a> | <a href=\"item?id=17020226\">2&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17021518'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">19.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17021518' href='vote?id=17021518&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://blog.elcomsoft.com/2018/05/ios-11-4-to-disable-usb-port-after-7-days-what-it-means-for-mobile-forensics/\" class=\"storylink\">iOS 11.4 to Disable USB Port After 7 Days: What It Means for Mobile Forensics</a><span class=\"sitebit comhead\"> (<a href=\"from?site=elcomsoft.com\"><span class=\"sitestr\">elcomsoft.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17021518\">466 points</span> by <a href=\"user?id=Artemis2\" class=\"hnuser\">Artemis2</a> <span class=\"age\"><a href=\"item?id=17021518\">11 hours ago</a></span> <span id=\"unv_17021518\"></span> | <a href=\"hide?id=17021518&amp;goto=news\">hide</a> | <a href=\"item?id=17021518\">321&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17022695'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">20.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17022695' href='vote?id=17022695&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.blog.google/products/gmail/subject-write-emails-faster-smart-compose-gmail\" class=\"storylink\">Write Emails Faster with Smart Compose in Gmail</a><span class=\"sitebit comhead\"> (<a href=\"from?site=blog.google\"><span class=\"sitestr\">blog.google</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17022695\">153 points</span> by <a href=\"user?id=devhxinc\" class=\"hnuser\">devhxinc</a> <span class=\"age\"><a href=\"item?id=17022695\">9 hours ago</a></span> <span id=\"unv_17022695\"></span> | <a href=\"hide?id=17022695&amp;goto=news\">hide</a> | <a href=\"item?id=17022695\">158&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17021011'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">21.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17021011' href='vote?id=17021011&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.nytimes.com/2018/05/07/opinion/stewart-brand-hippie-silicon.html\" class=\"storylink\">Stewart Brand Changed the World, Twice</a><span class=\"sitebit comhead\"> (<a href=\"from?site=nytimes.com\"><span class=\"sitestr\">nytimes.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17021011\">75 points</span> by <a href=\"user?id=tysone\" class=\"hnuser\">tysone</a> <span class=\"age\"><a href=\"item?id=17021011\">13 hours ago</a></span> <span id=\"unv_17021011\"></span> | <a href=\"hide?id=17021011&amp;goto=news\">hide</a> | <a href=\"item?id=17021011\">16&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17023917'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">22.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17023917' href='vote?id=17023917&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://wellbeing.google/\" class=\"storylink\">Great technology should improve life, not distract from it</a><span class=\"sitebit comhead\"> (<a href=\"from?site=wellbeing.google\"><span class=\"sitestr\">wellbeing.google</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17023917\">149 points</span> by <a href=\"user?id=panarky\" class=\"hnuser\">panarky</a> <span class=\"age\"><a href=\"item?id=17023917\">7 hours ago</a></span> <span id=\"unv_17023917\"></span> | <a href=\"hide?id=17023917&amp;goto=news\">hide</a> | <a href=\"item?id=17023917\">82&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17023305'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">23.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17023305' href='vote?id=17023305&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://techcrunch.com/2018/05/08/googles-ml-kit-makes-it-easy-to-add-ai-smart-to-ios-and-android-apps/\" class=\"storylink\">Google’s ML Kit makes it easy to add AI smarts to iOS and Android apps</a><span class=\"sitebit comhead\"> (<a href=\"from?site=techcrunch.com\"><span class=\"sitestr\">techcrunch.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17023305\">171 points</span> by <a href=\"user?id=coloneltcb\" class=\"hnuser\">coloneltcb</a> <span class=\"age\"><a href=\"item?id=17023305\">8 hours ago</a></span> <span id=\"unv_17023305\"></span> | <a href=\"hide?id=17023305&amp;goto=news\">hide</a> | <a href=\"item?id=17023305\">21&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17023220'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">24.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17023220' href='vote?id=17023220&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"http://blog.mass.gov/masslawlib/legal-history/the-law-against-lying-and-false-news-in-colonial-massachusetts/\" class=\"storylink\">Fake news was illegal in 17th century colonial Massachusetts</a><span class=\"sitebit comhead\"> (<a href=\"from?site=mass.gov\"><span class=\"sitestr\">mass.gov</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17023220\">144 points</span> by <a href=\"user?id=jimschley\" class=\"hnuser\">jimschley</a> <span class=\"age\"><a href=\"item?id=17023220\">9 hours ago</a></span> <span id=\"unv_17023220\"></span> | <a href=\"hide?id=17023220&amp;goto=news\">hide</a> | <a href=\"item?id=17023220\">152&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17015661'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">25.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17015661' href='vote?id=17015661&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://byorgey.wordpress.com/2018/05/06/conversations-with-a-six-year-old-on-functional-programming/\" class=\"storylink\">Conversations with a six-year-old on functional programming</a><span class=\"sitebit comhead\"> (<a href=\"from?site=byorgey.wordpress.com\"><span class=\"sitestr\">byorgey.wordpress.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17015661\">1865 points</span> by <a href=\"user?id=weatherlight\" class=\"hnuser\">weatherlight</a> <span class=\"age\"><a href=\"item?id=17015661\">1 day ago</a></span> <span id=\"unv_17015661\"></span> | <a href=\"hide?id=17015661&amp;goto=news\">hide</a> | <a href=\"item?id=17015661\">266&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17022215'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">26.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17022215' href='vote?id=17022215&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.buzzfeed.com/nicolenguyen/amazon-fake-review-problem\" class=\"storylink\">Amazon’s Fake Review Economy</a><span class=\"sitebit comhead\"> (<a href=\"from?site=buzzfeed.com\"><span class=\"sitestr\">buzzfeed.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17022215\">224 points</span> by <a href=\"user?id=jonbaer\" class=\"hnuser\">jonbaer</a> <span class=\"age\"><a href=\"item?id=17022215\">10 hours ago</a></span> <span id=\"unv_17022215\"></span> | <a href=\"hide?id=17022215&amp;goto=news\">hide</a> | <a href=\"item?id=17022215\">99&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17021205'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">27.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17021205' href='vote?id=17021205&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.forbes.com/sites/bizcarson/2018/05/08/mapbox-maps-developers/#79431e78164d\" class=\"storylink\">How Mapbox Is Winning Over Developers to Challenge Google's Mapping Dominance</a><span class=\"sitebit comhead\"> (<a href=\"from?site=forbes.com\"><span class=\"sitestr\">forbes.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17021205\">289 points</span> by <a href=\"user?id=coloneltcb\" class=\"hnuser\">coloneltcb</a> <span class=\"age\"><a href=\"item?id=17021205\">12 hours ago</a></span> <span id=\"unv_17021205\"></span> | <a href=\"hide?id=17021205&amp;goto=news\">hide</a> | <a href=\"item?id=17021205\">95&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17025367'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">28.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17025367' href='vote?id=17025367&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://arxiv.org/abs/1805.01929\" class=\"storylink\">Superconducting Optoelectronic Neurons I: General Principles</a><span class=\"sitebit comhead\"> (<a href=\"from?site=arxiv.org\"><span class=\"sitestr\">arxiv.org</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17025367\">9 points</span> by <a href=\"user?id=indescions_2018\" class=\"hnuser\">indescions_2018</a> <span class=\"age\"><a href=\"item?id=17025367\">5 hours ago</a></span> <span id=\"unv_17025367\"></span> | <a href=\"hide?id=17025367&amp;goto=news\">hide</a> | <a href=\"item?id=17025367\">1&nbsp;comment</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17023102'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">29.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17023102' href='vote?id=17023102&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://blogs.msdn.microsoft.com/commandline/2018/05/08/extended-eol-in-notepad/\" class=\"storylink\">Introducing extended line endings support in Notepad</a><span class=\"sitebit comhead\"> (<a href=\"from?site=microsoft.com\"><span class=\"sitestr\">microsoft.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17023102\">242 points</span> by <a href=\"user?id=dEnigma\" class=\"hnuser\">dEnigma</a> <span class=\"age\"><a href=\"item?id=17023102\">9 hours ago</a></span> <span id=\"unv_17023102\"></span> | <a href=\"hide?id=17023102&amp;goto=news\">hide</a> | <a href=\"item?id=17023102\">143&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "                <tr class='athing' id='17026837'>\n",
      "      <td align=\"right\" valign=\"top\" class=\"title\"><span class=\"rank\">30.</span></td>      <td valign=\"top\" class=\"votelinks\"><center><a id='up_17026837' href='vote?id=17026837&amp;how=up&amp;goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class=\"title\"><a href=\"https://www.thedailybeast.com/defector-wikileaks-will-lie-to-your-face\" class=\"storylink\">Defector: WikiLeaks ‘Will Lie to Your Face’</a><span class=\"sitebit comhead\"> (<a href=\"from?site=thedailybeast.com\"><span class=\"sitestr\">thedailybeast.com</span></a>)</span></td></tr><tr><td colspan=\"2\"></td><td class=\"subtext\">\n",
      "        <span class=\"score\" id=\"score_17026837\">9 points</span> by <a href=\"user?id=eplanit\" class=\"hnuser\">eplanit</a> <span class=\"age\"><a href=\"item?id=17026837\">1 hour ago</a></span> <span id=\"unv_17026837\"></span> | <a href=\"hide?id=17026837&amp;goto=news\">hide</a> | <a href=\"item?id=17026837\">2&nbsp;comments</a>              </td></tr>\n",
      "      <tr class=\"spacer\" style=\"height:5px\"></tr>\n",
      "            <tr class=\"morespace\" style=\"height:10px\"></tr><tr><td colspan=\"2\"></td><td class=\"title\"><a href=\"news?p=2\" class=\"morelink\" rel=\"nofollow\">More</a></td></tr>\n",
      "  </table>\n",
      "</td></tr>\n",
      "<tr><td><img src=\"s.gif\" height=\"10\" width=\"0\"><table width=\"100%\" cellspacing=\"0\" cellpadding=\"1\"><tr><td bgcolor=\"#ff6600\"></td></tr></table><br><center><span class=\"yclinks\"><a href=\"newsguidelines.html\">Guidelines</a>\n",
      "        | <a href=\"newsfaq.html\">FAQ</a>\n",
      "        | <a href=\"mailto:hn@ycombinator.com\">Support</a>\n",
      "        | <a href=\"https://github.com/HackerNews/API\">API</a>\n",
      "        | <a href=\"security.html\">Security</a>\n",
      "        | <a href=\"lists\">Lists</a>\n",
      "        | <a href=\"bookmarklet.html\" rel=\"nofollow\">Bookmarklet</a>\n",
      "        | <a href=\"http://www.ycombinator.com/legal/\">Legal</a>\n",
      "        | <a href=\"http://www.ycombinator.com/apply/\">Apply to YC</a>\n",
      "        | <a href=\"mailto:hn@ycombinator.com\">Contact</a></span><br><br><form method=\"get\" action=\"//hn.algolia.com/\">Search:\n",
      "          <input type=\"text\" name=\"q\" value=\"\" size=\"17\" autocorrect=\"off\" spellcheck=\"false\" autocapitalize=\"off\" autocomplete=\"false\"></form>\n",
      "            </center></td></tr>\n",
      "      </table></center></body><script type='text/javascript' src='hn.js?IE5MesKJj5PfEtg18JAO'></script>\n",
      "  </html>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "# Fetch a web page.\n",
    "# timeout (seconds) stops the cell from hanging indefinitely on an unresponsive server;\n",
    "# raise_for_status() raises on a 4xx/5xx reply instead of printing an error page.\n",
    "r = requests.get(\"https://news.ycombinator.com\", timeout=10)\n",
    "r.raise_for_status()\n",
    "print(r.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "            \n",
      "          \n",
      "        Hacker News\n",
      "        \n",
      "                  Hacker News\n",
      "              new | comments | show | ask | jobs | submit            \n",
      "                              login\n",
      "                          \n",
      "              \n",
      "\n",
      "              \n",
      "      1.      Google Duplex: An AI System for Accomplishing Real World Tasks Over the Phone (googleblog.com)\n",
      "        1023 points by ivank 9 hours ago  | hide | 432&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      2.      A Short Introduction to the Art of Programming – Edsgar W. Dijkstra [pdf] (utexas.edu)\n",
      "        27 points by Rescis 1 hour ago  | hide | 4&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      3.      You can now run Linux apps on Chrome OS (techcrunch.com)\n",
      "        357 points by willsinclair 6 hours ago  | hide | 153&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      4.      The Cutting Room Floor: Unearthing Unused Content from Video Games (tcrf.net)\n",
      "        64 points by indescions_2018 5 hours ago  | hide | 7&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      5.      Eshell as a main shell (bitbucket.io)\n",
      "        21 points by taeric 3 hours ago  | hide | 1&nbsp;comment              \n",
      "      \n",
      "                \n",
      "      6.      Yubico and Microsoft Introduce Passwordless Login (yubico.com)\n",
      "        183 points by guitarbill 9 hours ago  | hide | 138&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      7.      LispKit: framework for Lisp-based extension/scripting languages for macOS apps (github.com)\n",
      "        41 points by ingve 4 hours ago  | hide | 3&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      8.      Dynimize: Speed Up MySQL with CPU Performance Virtualization (dynimize.com)\n",
      "        49 points by nwrk 5 hours ago  | hide | 13&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      9.      Android P (blog.google)\n",
      "        179 points by alanfranzoni 7 hours ago  | hide | 145&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      10.      ls | grep “echo ${data}” – Why/how does this work? (zerobin.net)\n",
      "        40 points by indigodaddy 5 hours ago  | hide | 28&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      11.      A Recycled IP Address Caused Me to Pirate Books by Accident (nickjanetakis.com)\n",
      "        213 points by nickjj 13 hours ago  | hide | 78&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      12.      Anarchists in the Spanish Civil War (2002) (isreview.org)\n",
      "        40 points by dgarceran 5 hours ago  | hide | 11&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      13.      A Plane That Accidentally Circumnavigated the World (2014) (medium.com)\n",
      "        180 points by SeoxyS 13 hours ago  | hide | 33&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      14.      Mozilla Global Sprint 2018 (github.com)\n",
      "        178 points by robterthaddeus 7 hours ago  | hide | 24&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      15.      Blitz  Esports (YC S15) is hiring a front end engineer – build apps for gamers (medium.com)\n",
      "        1 hour ago | hide      \n",
      "      \n",
      "                \n",
      "      16.      How to Find Investors and Get Email Intros (atrium.co)\n",
      "        55 points by meredithah 8 hours ago  | hide | 15&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      17.      Building a Progressive Web App in React, using Firestore for offline support (truthlabs.com)\n",
      "        33 points by sconstantinides 4 hours ago  | hide | 3&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      18.      The Difficulty of Faking Data (1999) [pdf] (kkuniyuk.com)\n",
      "        23 points by tontonius 4 hours ago  | hide | 2&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      19.      iOS 11.4 to Disable USB Port After 7 Days: What It Means for Mobile Forensics (elcomsoft.com)\n",
      "        466 points by Artemis2 11 hours ago  | hide | 321&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      20.      Write Emails Faster with Smart Compose in Gmail (blog.google)\n",
      "        153 points by devhxinc 9 hours ago  | hide | 158&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      21.      Stewart Brand Changed the World, Twice (nytimes.com)\n",
      "        75 points by tysone 13 hours ago  | hide | 16&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      22.      Great technology should improve life, not distract from it (wellbeing.google)\n",
      "        149 points by panarky 7 hours ago  | hide | 82&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      23.      Google’s ML Kit makes it easy to add AI smarts to iOS and Android apps (techcrunch.com)\n",
      "        171 points by coloneltcb 8 hours ago  | hide | 21&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      24.      Fake news was illegal in 17th century colonial Massachusetts (mass.gov)\n",
      "        144 points by jimschley 9 hours ago  | hide | 152&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      25.      Conversations with a six-year-old on functional programming (byorgey.wordpress.com)\n",
      "        1865 points by weatherlight 1 day ago  | hide | 266&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      26.      Amazon’s Fake Review Economy (buzzfeed.com)\n",
      "        224 points by jonbaer 10 hours ago  | hide | 99&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      27.      How Mapbox Is Winning Over Developers to Challenge Google's Mapping Dominance (forbes.com)\n",
      "        289 points by coloneltcb 12 hours ago  | hide | 95&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      28.      Superconducting Optoelectronic Neurons I: General Principles (arxiv.org)\n",
      "        9 points by indescions_2018 5 hours ago  | hide | 1&nbsp;comment              \n",
      "      \n",
      "                \n",
      "      29.      Introducing extended line endings support in Notepad (microsoft.com)\n",
      "        242 points by dEnigma 9 hours ago  | hide | 143&nbsp;comments              \n",
      "      \n",
      "                \n",
      "      30.      Defector: WikiLeaks ‘Will Lie to Your Face’ (thedailybeast.com)\n",
      "        9 points by eplanit 1 hour ago  | hide | 2&nbsp;comments              \n",
      "      \n",
      "            More\n",
      "  \n",
      "\n",
      "Guidelines\n",
      "        | FAQ\n",
      "        | Support\n",
      "        | API\n",
      "        | Security\n",
      "        | Lists\n",
      "        | Bookmarklet\n",
      "        | Legal\n",
      "        | Apply to YC\n",
      "        | ContactSearch:\n",
      "          \n",
      "            \n",
      "      \n",
      "  \n",
      "\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "# Strip HTML markup with a regular expression.\n",
    "# A tag is matched as '<', then the shortest run of non-newline characters, then '>'.\n",
    "pattern = re.compile(r'<.*?>')\n",
    "print(re.sub(pattern, '', r.text))  # every match is replaced by the empty string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "            \n",
      "          \n",
      "        Hacker News\n",
      "        \n",
      "                  Hacker News\n",
      "              new | comments | show | ask | jobs | submit            \n",
      "                              login\n",
      "                          \n",
      "              \n",
      "\n",
      "              \n",
      "      1.      Google Duplex: An AI System for Accomplishing Real World Tasks Over the Phone (googleblog.com)\n",
      "        1023 points by ivank 9 hours ago  | hide | 432 comments              \n",
      "      \n",
      "                \n",
      "      2.      A Short Introduction to the Art of Programming – Edsgar W. Dijkstra [pdf] (utexas.edu)\n",
      "        27 points by Rescis 1 hour ago  | hide | 4 comments              \n",
      "      \n",
      "                \n",
      "      3.      You can now run Linux apps on Chrome OS (techcrunch.com)\n",
      "        357 points by willsinclair 6 hours ago  | hide | 153 comments              \n",
      "      \n",
      "                \n",
      "      4.      The Cutting Room Floor: Unearthing Unused Content from Video Games (tcrf.net)\n",
      "        64 points by indescions_2018 5 hours ago  | hide | 7 comments              \n",
      "      \n",
      "                \n",
      "      5.      Eshell as a main shell (bitbucket.io)\n",
      "        21 points by taeric 3 hours ago  | hide | 1 comment              \n",
      "      \n",
      "                \n",
      "      6.      Yubico and Microsoft Introduce Passwordless Login (yubico.com)\n",
      "        183 points by guitarbill 9 hours ago  | hide | 138 comments              \n",
      "      \n",
      "                \n",
      "      7.      LispKit: framework for Lisp-based extension/scripting languages for macOS apps (github.com)\n",
      "        41 points by ingve 4 hours ago  | hide | 3 comments              \n",
      "      \n",
      "                \n",
      "      8.      Dynimize: Speed Up MySQL with CPU Performance Virtualization (dynimize.com)\n",
      "        49 points by nwrk 5 hours ago  | hide | 13 comments              \n",
      "      \n",
      "                \n",
      "      9.      Android P (blog.google)\n",
      "        179 points by alanfranzoni 7 hours ago  | hide | 145 comments              \n",
      "      \n",
      "                \n",
      "      10.      ls | grep “echo ${data}” – Why/how does this work? (zerobin.net)\n",
      "        40 points by indigodaddy 5 hours ago  | hide | 28 comments              \n",
      "      \n",
      "                \n",
      "      11.      A Recycled IP Address Caused Me to Pirate Books by Accident (nickjanetakis.com)\n",
      "        213 points by nickjj 13 hours ago  | hide | 78 comments              \n",
      "      \n",
      "                \n",
      "      12.      Anarchists in the Spanish Civil War (2002) (isreview.org)\n",
      "        40 points by dgarceran 5 hours ago  | hide | 11 comments              \n",
      "      \n",
      "                \n",
      "      13.      A Plane That Accidentally Circumnavigated the World (2014) (medium.com)\n",
      "        180 points by SeoxyS 13 hours ago  | hide | 33 comments              \n",
      "      \n",
      "                \n",
      "      14.      Mozilla Global Sprint 2018 (github.com)\n",
      "        178 points by robterthaddeus 7 hours ago  | hide | 24 comments              \n",
      "      \n",
      "                \n",
      "      15.      Blitz  Esports (YC S15) is hiring a front end engineer – build apps for gamers (medium.com)\n",
      "        1 hour ago | hide      \n",
      "      \n",
      "                \n",
      "      16.      How to Find Investors and Get Email Intros (atrium.co)\n",
      "        55 points by meredithah 8 hours ago  | hide | 15 comments              \n",
      "      \n",
      "                \n",
      "      17.      Building a Progressive Web App in React, using Firestore for offline support (truthlabs.com)\n",
      "        33 points by sconstantinides 4 hours ago  | hide | 3 comments              \n",
      "      \n",
      "                \n",
      "      18.      The Difficulty of Faking Data (1999) [pdf] (kkuniyuk.com)\n",
      "        23 points by tontonius 4 hours ago  | hide | 2 comments              \n",
      "      \n",
      "                \n",
      "      19.      iOS 11.4 to Disable USB Port After 7 Days: What It Means for Mobile Forensics (elcomsoft.com)\n",
      "        466 points by Artemis2 11 hours ago  | hide | 321 comments              \n",
      "      \n",
      "                \n",
      "      20.      Write Emails Faster with Smart Compose in Gmail (blog.google)\n",
      "        153 points by devhxinc 9 hours ago  | hide | 158 comments              \n",
      "      \n",
      "                \n",
      "      21.      Stewart Brand Changed the World, Twice (nytimes.com)\n",
      "        75 points by tysone 13 hours ago  | hide | 16 comments              \n",
      "      \n",
      "                \n",
      "      22.      Great technology should improve life, not distract from it (wellbeing.google)\n",
      "        149 points by panarky 7 hours ago  | hide | 82 comments              \n",
      "      \n",
      "                \n",
      "      23.      Google’s ML Kit makes it easy to add AI smarts to iOS and Android apps (techcrunch.com)\n",
      "        171 points by coloneltcb 8 hours ago  | hide | 21 comments              \n",
      "      \n",
      "                \n",
      "      24.      Fake news was illegal in 17th century colonial Massachusetts (mass.gov)\n",
      "        144 points by jimschley 9 hours ago  | hide | 152 comments              \n",
      "      \n",
      "                \n",
      "      25.      Conversations with a six-year-old on functional programming (byorgey.wordpress.com)\n",
      "        1865 points by weatherlight 1 day ago  | hide | 266 comments              \n",
      "      \n",
      "                \n",
      "      26.      Amazon’s Fake Review Economy (buzzfeed.com)\n",
      "        224 points by jonbaer 10 hours ago  | hide | 99 comments              \n",
      "      \n",
      "                \n",
      "      27.      How Mapbox Is Winning Over Developers to Challenge Google's Mapping Dominance (forbes.com)\n",
      "        289 points by coloneltcb 12 hours ago  | hide | 95 comments              \n",
      "      \n",
      "                \n",
      "      28.      Superconducting Optoelectronic Neurons I: General Principles (arxiv.org)\n",
      "        9 points by indescions_2018 5 hours ago  | hide | 1 comment              \n",
      "      \n",
      "                \n",
      "      29.      Introducing extended line endings support in Notepad (microsoft.com)\n",
      "        242 points by dEnigma 9 hours ago  | hide | 143 comments              \n",
      "      \n",
      "                \n",
      "      30.      Defector: WikiLeaks ‘Will Lie to Your Face’ (thedailybeast.com)\n",
      "        9 points by eplanit 1 hour ago  | hide | 2 comments              \n",
      "      \n",
      "            More\n",
      "  \n",
      "\n",
      "Guidelines\n",
      "        | FAQ\n",
      "        | Support\n",
      "        | API\n",
      "        | Security\n",
      "        | Lists\n",
      "        | Bookmarklet\n",
      "        | Legal\n",
      "        | Apply to YC\n",
      "        | ContactSearch:\n",
      "          \n",
      "            \n",
      "      \n",
      "  \n",
      "\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Parse the page with Beautiful Soup (html5lib parser) and print only its text content\n",
    "soup = BeautifulSoup(markup=r.text, features=\"html5lib\")\n",
    "print(soup.get_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tr class=\"athing\" id=\"17022963\">\n",
       "      <td align=\"right\" class=\"title\" valign=\"top\"><span class=\"rank\">1.</span></td>      <td class=\"votelinks\" valign=\"top\"><center><a href=\"vote?id=17022963&amp;how=up&amp;goto=news\" id=\"up_17022963\"><div class=\"votearrow\" title=\"upvote\"></div></a></center></td><td class=\"title\"><a class=\"storylink\" href=\"https://ai.googleblog.com/2018/05/duplex-ai-system-for-natural-conversation.html\">Google Duplex: An AI System for Accomplishing Real World Tasks Over the Phone</a><span class=\"sitebit comhead\"> (<a href=\"from?site=googleblog.com\"><span class=\"sitestr\">googleblog.com</span></a>)</span></td></tr>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect every <tr> element carrying the CSS class \"athing\" (one per article)\n",
    "summaries = soup.find_all(\"tr\", attrs={\"class\": \"athing\"})\n",
    "summaries[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Google Duplex: An AI System for Accomplishing Real World Tasks Over the Phone'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pull the headline text out of the first article row\n",
    "first_link = summaries[0].find(\"a\", class_=\"storylink\")\n",
    "first_link.get_text().strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30 Article summaries found. Sample:\n",
      "Google Duplex: An AI System for Accomplishing Real World Tasks Over the Phone\n"
     ]
    }
   ],
   "source": [
    "# Find all articles, extract titles\n",
    "articles = []\n",
    "summaries = soup.find_all(\"tr\", class_=\"athing\")\n",
    "for summary in summaries:\n",
    "    link = summary.find(\"a\", class_=\"storylink\")\n",
    "    if link is None:\n",
    "        # find() returns None when the row has no matching link;\n",
    "        # skip it instead of failing on None.get_text()\n",
    "        continue\n",
    "    title = link.get_text().strip()\n",
    "    articles.append(title)\n",
    "\n",
    "print(len(articles), \"Article summaries found. Sample:\")\n",
    "if articles:  # nothing to sample when no titles were extracted\n",
    "    print(articles[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Normalization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Case Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?\n"
     ]
    }
   ],
   "source": [
    "# Sample text for the normalization and tokenization steps below\n",
    "text = (\n",
    "    \"The first time you see The Second Renaissance it may look boring. \"\n",
    "    \"Look at it at least twice and definitely watch part 2. \"\n",
    "    \"It will change your view of the matrix. \"\n",
    "    \"Are the human people the ones who started the war ? \"\n",
    "    \"Is AI a bad thing ?\"\n",
    ")\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?\n"
     ]
    }
   ],
   "source": [
    "# Lowercase the sample so that words differing only in case compare equal\n",
    "text = str.lower(text)\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Punctuation Removal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  \n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "# Remove punctuation characters\n",
    "text = re.sub(r\"[^a-zA-Z0-9]\", \" \", text) \n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']\n"
     ]
    }
   ],
   "source": [
    "# Split text into tokens (words)\n",
    "words = text.split()\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### NLTK: Natural Language ToolKit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import nltk\n",
    "nltk.data.path.append(os.path.join(os.getcwd(), \"nltk_data\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.\n"
     ]
    }
   ],
   "source": [
    "# Another sample text\n",
    "text = \"Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.\"\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']\n"
     ]
    }
   ],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Split text into words using NLTK\n",
    "words = word_tokenize(text)\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']\n"
     ]
    }
   ],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "# Split text into sentences\n",
    "sentences = sent_tokenize(text)\n",
    "print(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
     ]
    }
   ],
   "source": [
    "# List stop words\n",
    "from nltk.corpus import stopwords\n",
    "print(stopwords.words(\"english\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']\n"
     ]
    }
   ],
   "source": [
    "# Reset text\n",
    "text = \"The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?\"\n",
    "\n",
    "# Normalize it\n",
    "text = re.sub(r\"[^a-zA-Z0-9]\", \" \", text.lower())\n",
    "\n",
    "# Tokenize it\n",
    "words = text.split()\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']\n"
     ]
    }
   ],
   "source": [
    "# Remove stop words\n",
    "words = [w for w in words if w not in stopwords.words(\"english\")]\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sentence Parsing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  (NP I)\n",
      "  (VP\n",
      "    (VP (V shot) (NP (Det an) (N elephant)))\n",
      "    (PP (P in) (NP (Det my) (N pajamas)))))\n",
      "(S\n",
      "  (NP I)\n",
      "  (VP\n",
      "    (V shot)\n",
      "    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "# Define a custom grammar\n",
    "my_grammar = nltk.CFG.fromstring(\"\"\"\n",
    "S -> NP VP\n",
    "PP -> P NP\n",
    "NP -> Det N | Det N PP | 'I'\n",
    "VP -> V NP | VP PP\n",
    "Det -> 'an' | 'my'\n",
    "N -> 'elephant' | 'pajamas'\n",
    "V -> 'shot'\n",
    "P -> 'in'\n",
    "\"\"\")\n",
    "parser = nltk.ChartParser(my_grammar)\n",
    "\n",
    "# Parse a sentence\n",
    "sentence = word_tokenize(\"I shot an elephant in my pajamas\")\n",
    "for tree in parser.parse(sentence):\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stemming & Lemmatization\n",
    "\n",
    "### Stemming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['first', 'time', 'see', 'second', 'renaiss', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definit', 'watch', 'part', '2', 'chang', 'view', 'matrix', 'human', 'peopl', 'one', 'start', 'war', 'ai', 'bad', 'thing']\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "# Reduce words to their stems\n",
    "stemmed = [PorterStemmer().stem(w) for w in words]\n",
    "print(stemmed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'started', 'war', 'ai', 'bad', 'thing']\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "\n",
    "# Reduce words to their root form\n",
    "lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]\n",
    "print(lemmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'start', 'war', 'ai', 'bad', 'thing']\n"
     ]
    }
   ],
   "source": [
    "# Lemmatize verbs by specifying pos\n",
    "lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in lemmed]\n",
    "print(lemmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
